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Appendix D 



/*/////////////////////////////////////////////////////////////////////////// 

// FILE: phnspell.cpp 

// CREATED: 2 -Jan- 96 

// AUTHOR: Charles Ingold 

// DESCRIPTION: Pron spelling and frequency table class. 

// 

// Copyright (C) Dragon Systems, 1995-1996 
// DRAGON SYSTEMS CONFIDENTIAL 

// 

// Revision history log 

VSS revision history. Do not edit by hand. 
$Log: /PQ/prons/phnspell . cpp $ 

3 3/25/97 16:21 Chuck 

PHONEQUERY Ver 0.01.172 

Removed old-tree-project-only SDWord_{Set,Get}Frequency calls to build 
under MREC. This means that for now we don't rescale the one-grams in 
the pron guesser. 

*///////////////////////////////////////////////////////////// 
#include "stdafx.h" 

//#include "trec.h" 

//#include "myassert . h" 

//#include "cutil.h" 

//#include "sdapi .h" 

//#include "apputil.h" 

//#include "ckapi .h" 

#if 0 

#ifdef TREC 

extern SDInteger SDWord_Get Frequency ( SDhVoc hVoc, SDhWord hWord ) ; 
extern void SDWord_SetFrequency ( SDhVoc hVoc, SDhWord hWord, SDInteger 
freq ) ; 

#endif 

#define TOTAL_VOC_FREQ 200000000 

#endif 

#include <ctype.h> 

#include "phnspell . h" 
/ / #include "results . h" 

// Global parameter read by getGuessWords(): selects whether the word
// fragments added to a pron-network state carry their PhnSpellArray
// frequency (1, the default) or a uniform frequency of 1 (0).
GlobalParDef( gParUsePhnSpellLM, "phnspell/UsePhnSpellLM", E_INT, 1,
    "1 means use the frequencies from the PhnSpellArray to build pron-networks\n"
    "0 means use the same frequency for all words in a given SDState in pron-networks." );

// Error table for this module.  The trailing comments list the printf-style
// arguments each message expects.
DEF_ERR( PhnSpell, 1,
    "PhnSpell_ReadAscii() format error line = <%s>" );    // 1 %s line
DEF_ERR( PhnSpell, 2,
    "PhnSpell_ReadAscii() file must be sorted in ascending order on spelling.\n"
    " %s < %s\n" );
DEF_ERR( PhnSpell, 3,
    "PhnSpell Datafile Version %d unknown. Try version %d." );
DEF_ERR( PhnSpell, 4,
    "PhnSpell data corrupted." );
DEF_ERR( PhnSpell, 5,
    "Read from disk has failed." );
DEF_ERR( PhnSpell, 6,
    "Write to disk has failed." );
DEF_ERR( PhnSpell, 7,          // 1 %c unknown char  2 %s spelling
    "No known pronunciation for character '%c' in spelling '%s'." );

// Version stamp written by writeBinaryFile() and checked by readBinaryFile().
#define PHN_SPELL_ARRAY_FILE_VERSION 1
#if 0

//////////////////////////////////////////////////////////////////////////
// Helper func for getGuessRule, rescales the frequencies for words in
// hState by scaleFactor.
//
// Disabled: the header change log says the SDWord_{Set,Get}Frequency calls
// were removed so the file builds under MREC.
#ifdef TREC
void rescaleWordFreqs(SDhVoc hVoc, SDhState hState, SDInteger scaleFactor)
{
    SDhWordIterator hSWIter =
        SDState_IterateWords(hVoc, hState, SDHCOLL_NOCOLLATION,
                             SD_WORD_NORESTRICTION, "");

    SDInteger freq = 0;
    SDInteger newFreq = 0;

    SDhWord hWord = SDWord_Next(hSWIter);
    while ( hWord != 0 )
    {
        freq = SDWord_GetFrequency(hVoc, hWord);
        newFreq = freq * scaleFactor;
        SDWord_SetFrequency(hVoc, hWord, newFreq);
        hWord = SDWord_Next(hSWIter);
    }

    SDWord_EndIteration(hSWIter);
}
#endif

#endif

/////////////////////////////////////////////////////////////// 
// Populate hGuessState with words based on prons for pPhnSpell. 
void PhnSpell Array: rgetGuessWords (SDhVoc hScratchVoc, 
// SDhState hScratchParentState , 

SDhState hGuessState ; 
char *szGuessStateName l 
SDInteger *pTotalFreq, 
PhnSpell *pPhnSpell) { 

/// Create SDhWord items for each pron 
PronOf f set *pPronOf f set = getOf f setPronO ( pPhnSpell ) ; 
SDWordLMInfo wordLMInf o ; 

wordLMInfo .version = CURRENT_LM INF0_VERS I ON ; 
wordLMInf o . nUnigramCount= 1; 

stmcpyl (char *) wordLMInf o . cat egoryName , "XX", 3) ; 

SDInteger bestFreq = 0; 
PronOf f set *pBestPronOf f set = NULL; 

// Get the set of phonetic string fragments for each 

// substring which matches the string from this position 

while ( pPronOffset ) { 
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// force double 0-byte termination 
int lenPronBuf = 2 + strlen( getPron( pPronOffset ) ); 
char *pPronBuf = DgnNewArray ( char , lenPronBuf) ; 
memset (pPronBuf , 0, lenPronBuf); 

strncpy (pPronBuf , getPron( pPronOffset ) , lenPronBuf - 2) ; 
// pPronBuf [ lenPronBuf - 2 ] = ' ' ; 

SDhWord hWord = 0; 

int wordLen = strlen (pPronBuf ) + strlen (szGuessStateName) + 2 / 
char *pWordName= DgnNewArray (char , wordLen); 
memset (pWordName, 0, wordLen); 

sprintf (pWordName+strlen (pWordName) , "%s\\%s M , szGuessStateName , 
pPronBuf) ; 

CHK_SDAPI (hWord = SDWord_GetHandle (hScratchVoc, pWordName) ); 
if ( hWord == 0 ) { 

SDInteger freq = getFrequency ( pPronOffset ) ; 
if ( freq > bestFreq) { 
bestFreq = freq; 
pBestPronOf f set= pPronOffset; 

if ( (int) gParUsePhnSpellLM == 0 ) { 
freq = 1 ; 

} 

/* #ifdef TREC 

CHK_SDAPI ( hWord = S D Wo rd JSTe w ( hScratchVoc, pPronBuf ) ); 
assert (hWord) ; 

SDWord_Set Frequency ( hScratchVoc, hWord, freq) ; 



#else 
*/ 



// #endif 



wordLMInfo.nUnigramCount = freq; 

CHK_SDAPI ( hWord = SDWord__NewWithLMInf o (hScratchVoc , 

pWordName , &wordLMInf o ) 



pPronBuf [ lenPronBuf - 2 ] = 0; 

CHK_SDAPI ( SDWord_SetPronunciations ( hScratchVoc, hWord, 

(unsigned char*) pPronBuf) ) ; 

pTotalFreq += freq; 
^ CHK_SDAPI ( SDState_AddWord( hScratchVoc, hGuessState, hWord) ) 

pPronOffset = getOf f setNextPron (pPronOffset ) ; 
DgnDeleteArray (pPronBuf ) ; 
^ DgnDeleteArray (pWordName) ; 

#if o 

#ifdef TREC 

// rescale the frequencies for the words in the state 
SDInteger f reqScaleFactor = TOTAL_VOC_FREQ / pTotalFreq; 
rescaleWordFreqs (hScratchVoc, hGuessState, f reqScaleFactor) ; 
#endif 

#endif 

// xDumpState (hScratchVoc, phStateArray [ spellLen ]); 

// Record the best pron based on unigram scores 
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CHK_SDAPI ( SDhEnv hStateEnv = SDState_AccessEnv ( hScratchVoc, 
hGuessState, SDENV_ADDTO ) ) ; 

CHK_SDAPI( SDEnv_SetData (hStateEnv, "bestPron" , 
getPron (pBestPronOf f set) , strlen (getPron (pBestPronOf f set) ) +1) ) ; 

CHK_SDAPI( SDEnv_SetData (hStateEnv, "bestFreq" , fcbestFreq, 
sizeof (SDInteger) ) ) ; 

} 

//We use alternates for states, rather than words, so 
//we need to know the number of different length spelling entries 
// which match the unknown word. For now, we just add unreeded 
// start/end alternative operations. 

// Fill in phStateArray with states for partial matches on szSpelling 
void PhnSpellArray : : getGuessStates (SDhVoc hScratchVoc, 

SDhState hScratchParentState , 

SDhState *phStateArray, 

const char *szSpelling) 

const int lenUnknown = strlen (szSpelling) ; 
char *pStateName = DgnNe wAr ray (char , 2 * (lenUnknown + 1)+ 3 ); 
SDInteger *totalFreqArray = DgnNewArray( SDInteger, lenUnknown + 1 ) ; 
memset ( total FreqArray, 0, (lenUnknown+1) * sizeof (SDInteger) ); 
int nSpellMatches = 0; 
PhnSpell *pPhnSpell = NULL; 

for (pPhnSpell = firstPhnSpellMatch (szSpelling) ; 
pPhnSpell != 0; 

pPhnSpell = nextPhnSpellMatch( pPhnSpell ) ) { 
nSpellMatches++ ; 

int spellLen = wcslen ( pPhnSpell ); 

// From here, we transition spellLen-many chars further into 
// word, and we had better land on either another char in 
// unknown, or the terminal node 
assert (spellLen <= lenUnknown) ; 

/// Does state exist? Create state if necessary 
memmove (pStateName, szSpelling, spellLen) ; 
pStateName [spellLen] = 0 ; 
CHK_SDAPI( phStateArray [spellLen] = SDState_Get Handle (hScratchVoc , 
pStateName, 

hScratchParentState) ) ; 

if (phStateArray [spellLen] == 0) { 

CHK_SDAPI( SDhState hNewState = SDState_New (hScratchVoc, 
hScratchParentState) ) ; 

assert ( hNewState) ; 

CHK_SDAPI( SDState_SetLMAllowed (hScratchVoc, hNewState, 1) ); 
CHK_SDAPI ( SDState_SetName (hScratchVoc , hNewState , 

pStateName) ) ; 

phStateArray [spellLen] = hNewState; 

getGuessWords (hScratchVoc, /* hScratchParentState ,*/ 
phStateArray [spellLen] , 

pStateName, total FreqArray + spellLen, 

pPhnSpell) ; 

} 

} // End loop over phnSpell matches 
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if (pStateName) 

DgnDeleteArray (pStateName) ; 



} 



if (totalFreqArray) 

DgnDeleteArray (totalFreqArray) ; 

if (nSpellMatches == 0) { 

^ errThrow( USE_ERR ( PhnSpell, 7), pPhnSpell [0] , pPhnSpell ); 



//////////////////////////////////////////////////////////////////
//
// PhnSpellArray::getGuessRule()
//
// Description:
//    Create a state machine to guess a pron for szSpelling.
// Return:
//    The SDhRule for the first character of the spelling (renamed to
//    szRuleName before returning).
//
// We create an array of RuleItems which looks like:
//
//    RULE <string>
//    { StartAlternates }
//    [StartSequence]
//        STATE "prefix"
//        RULE "suffix"
//    [EndSequence]  Frequency
//    { EndALT }
//
// The <> are mandatory.  We have both {} items or neither of them, and they
// may contain a number of Start/EndSequence items.  We have all [] items or
// none of them.
// How many rule items for each rule?
// Estimate 2 + (4*nWords): StartAlt + nWords*(StartSeq Word Rule EndSeq) +
// EndAlt, since most words are in a sequence which references another rule.
//
// Build a pron network for szSpelling.  Do this by building a rule for each
// position in the unknown str.  Each of these rules consists of a list of
// alternates.  Each alternate consists of a sequence of phonetic fragment
// and an optional reference to the substring of unknown which matches a
// phoneme/spelling fragment in the PhnSpellArray.
//
// Each new rule also records "bestFreq" / "bestPron" in its environment:
// the highest (state bestFreq + following-rule bestFreq) over its
// alternatives and the concatenation of the corresponding prons.
//
// NOTE(review): best prons are read into fixed 50-byte buffers; how
// SDEnv_GetData behaves for longer data is not visible here - confirm
// before relying on "bestPron" for long spellings.
//
// FUTURE: deal with unpronounced chars in a more principled way
SDhRule PhnSpellArray::getGuessRule(SDhVoc hScratchVoc,
                                    SDhState hScratchParentState,
                                    const char *szRuleName,
                                    const char *szSpelling)
{
    assert( szSpelling );

    SDhRule hRetRule = 0;

    int lenUnknown = (int) strlen( szSpelling );
    assert( lenUnknown > 0 );

    // Lower-cased working copy of the spelling.
    char *unknown = DgnNewArray( char, lenUnknown + 1 );
    assert( unknown );

    for ( int i = 0; i < lenUnknown; i++ ) {
        unknown[i] = CAST( char, tolower( szSpelling[i] ) );
    }
    // Terminate by length: the loop index is out of scope here under
    // standard for-scoping rules.
    unknown[lenUnknown] = 0;

    SDhRule hNextRule = 0;

    /// Create a RULE for each position in the unknown word,
    /// process from last char to first, so we know the rule handles.
    SDhRule *RulesArray = DgnNewArray( SDhRule, lenUnknown );

    RuleItemArray ruleItemArray;
    SDRuleItem ruleItem;
    memset( &ruleItem, 0, sizeof(SDRuleItem) );

    SDhState *hStateArray = DgnNewArray( SDhState, lenUnknown + 1 );
    int strOffset;

    for ( strOffset = lenUnknown - 1; strOffset >= 0; strOffset-- )
    {
        // Build a rule for the remainder of the word (pSubStr).
        char *pSubStr = unknown + strOffset;
        ruleItemArray.removeAll();

        // Reuse a rule that already exists for this suffix.
        CHK_SDAPI( hNextRule = SDRule_GetHandle(
                       hScratchVoc, hScratchParentState, pSubStr ) );
        if ( hNextRule != 0 ) {
            RulesArray[strOffset] = hNextRule;
            continue;
        }

        // Insert Start Op Alt at beginning of description.
        ruleItem.type = SD_RULE_STARTOPERATION;
        ruleItem.frequency = 0;
        ruleItem.hVoc = hScratchVoc;
        ruleItem.value.operation = SD_RULE_OPERATION_ALTERNATIVE;
        ruleItemArray.add( ruleItem, 0 );

        // Create a state and description for each partial match on the
        // spelling.
        memset( hStateArray, 0, (lenUnknown + 1) * sizeof(SDhState) );
        getGuessStates( hScratchVoc, hScratchParentState,
                        hStateArray, pSubStr );

        SDInteger bestFreq = 0;
        SDhRule hBestRule = 0;
        SDhState hBestState = 0;

        for ( int spellLen = 0; strOffset + spellLen <= lenUnknown; spellLen++ )
        {
            if ( hStateArray[spellLen] == 0 ) {
                continue;       // no table spelling of this length matched
            }

            SDhState hCurrentState = hStateArray[spellLen];
            SDhRule hCurrentRule = 0;
            SDInteger stateFreq = 0;
            SDInteger ruleFreq = 0;

            CHK_SDAPI( SDhEnv hStateEnv = SDState_AccessEnv( hScratchVoc,
                                              hCurrentState, SDENV_EXISTING ) );
            CHK_SDAPI( SDEnv_GetData( hStateEnv, "bestFreq", &stateFreq,
                                      sizeof(SDInteger) ) );

            // Add a description for the new state
            if ( strOffset + spellLen < lenUnknown ) {
                // We go to another char in unknown
                hCurrentRule = RulesArray[strOffset + spellLen];

                // Add StartOperationSequence item
                ruleItem.type = SD_RULE_STARTOPERATION;
                ruleItem.frequency = 0;     // pPhnSpell->getFreq();
                ruleItem.hVoc = hScratchVoc;
                ruleItem.value.operation = SD_RULE_OPERATION_SEQUENCE;
                ruleItemArray.add( ruleItem );

                // Add State item
                ruleItem.type = SD_RULE_STATE;
                ruleItem.frequency = 0;
                ruleItem.hVoc = hScratchVoc;
                ruleItem.value.hState = hCurrentState;
                ruleItemArray.add( ruleItem );

                // Add Rule item for next rule
                ruleItem.type = SD_RULE_RULE;
                ruleItem.frequency = 0;
                ruleItem.hVoc = hScratchVoc;
                ruleItem.value.hRule = hCurrentRule;
                ruleItemArray.add( ruleItem );

                // Add EndOperationSequence item
                ruleItem.type = SD_RULE_ENDOPERATION;
                ruleItem.frequency = 0;
                ruleItem.hVoc = hScratchVoc;
                ruleItem.value.operation = SD_RULE_OPERATION_SEQUENCE;
                ruleItemArray.add( ruleItem );

                CHK_SDAPI( SDhEnv hRuleEnv =
                    SDRule_AccessEnv( hScratchVoc, hCurrentRule, SDENV_EXISTING ) );
                CHK_SDAPI( SDEnv_GetData( hRuleEnv, "bestFreq",
                                          &ruleFreq, sizeof(SDInteger) ) );
            } else {
                // We go to the terminal node from here
                ruleItem.type = SD_RULE_STATE;
                ruleItem.frequency = 0;
                ruleItem.hVoc = hScratchVoc;
                ruleItem.value.hState = hCurrentState;
                ruleItemArray.add( ruleItem );
            }

            // Always accept the first candidate so hBestState is a valid
            // handle even if every frequency is zero.
            if ( hBestState == 0 || stateFreq + ruleFreq > bestFreq ) {
                bestFreq = stateFreq + ruleFreq;
                hBestRule = hCurrentRule;
                hBestState = hCurrentState;
            }
        }

        // Close the list of alternatives.
        ruleItem.type = SD_RULE_ENDOPERATION;
        ruleItem.frequency = 0;
        ruleItem.hVoc = hScratchVoc;
        ruleItem.value.operation = SD_RULE_OPERATION_ALTERNATIVE;
        ruleItemArray.add( ruleItem );

        // Add the new rule to the voc
        SDRuleItem *pRuleItems = ruleItemArray.getData();
        int nItems = ruleItemArray.count();

        CHK_SDAPI( SDhRule hNewRule = SDRule_New( hScratchVoc,
                                                  hScratchParentState ) );
        CHK_SDAPI( SDRule_SetDescription( hScratchVoc, hNewRule, pRuleItems,
                                          nItems ) );
        CHK_SDAPI( SDRule_SetName( hScratchVoc, hNewRule, unknown + strOffset ) );
        assert( hNewRule );

        RulesArray[strOffset] = hNewRule;
        // xDumpRule (hScratchVoc, hRule);

        /// Set the bestPron env var for the new rule
        int rulePronLen = 50;
        int statePronLen = 50;

        // Both buffers start zeroed and are re-terminated after the reads,
        // so the sprintf below always sees proper C strings.
        char *pBestRulePron = DgnNewArray( char, rulePronLen );
        memset( pBestRulePron, 0, rulePronLen );

        char *pBestStatePron = DgnNewArray( char, statePronLen );
        memset( pBestStatePron, 0, statePronLen );

        if ( hBestRule ) {
            CHK_SDAPI( SDhEnv hRuleEnv = SDRule_AccessEnv( hScratchVoc,
                                             hBestRule, SDENV_EXISTING ) );
            CHK_SDAPI( SDEnv_GetData( hRuleEnv, "bestPron",
                                      pBestRulePron, rulePronLen ) );
            pBestRulePron[rulePronLen - 1] = 0;
        }

        CHK_SDAPI( SDhEnv hStateEnv = SDState_AccessEnv( hScratchVoc,
                                          hBestState, SDENV_EXISTING ) );
        CHK_SDAPI( SDEnv_GetData( hStateEnv, "bestPron", pBestStatePron,
                                  statePronLen ) );
        pBestStatePron[statePronLen - 1] = 0;

        // Record the best pron based on unigram scores
        int bestPronLen = rulePronLen + statePronLen + 1;
        char *pNewBestPron = DgnNewArray( char, bestPronLen );
        sprintf( pNewBestPron, "%s%s", pBestStatePron, pBestRulePron );

        CHK_SDAPI( SDhEnv hRuleEnv = SDRule_AccessEnv( hScratchVoc,
                                         hNewRule, SDENV_ADDTO ) );
        CHK_SDAPI( SDEnv_SetData( hRuleEnv, "bestFreq", &bestFreq,
                                  sizeof(SDInteger) ) );
        CHK_SDAPI( SDEnv_SetData( hRuleEnv, "bestPron", pNewBestPron,
                                  strlen( pNewBestPron ) + 1 ) );

        // Release the per-iteration scratch buffers (previously leaked).
        DgnDeleteArray( pNewBestPron );
        DgnDeleteArray( pBestStatePron );
        DgnDeleteArray( pBestRulePron );
    } // End loop over chars in unknown word

    if ( hStateArray ) {
        DgnDeleteArray( hStateArray );
    }

    if ( unknown ) {
        DgnDeleteArray( unknown );
    }

    if ( RulesArray ) {
        hRetRule = RulesArray[0];
        CHK_SDAPI( SDRule_SetName( hScratchVoc, hRetRule, szRuleName ) );
        DgnDeleteArray( RulesArray );
    }

    return hRetRule;
}

////////////////////////////////////////////////////////////////////////////
// Set mpWCTargetSpell to be a wide-character copy of pTargetSpell and
// return a pointer to the PhnSpell entry for the first character in
// pTargetSpell.
//
// Returns 0 when the first character has no entry in mWCOffsetTable, or
// when pTargetSpell cannot be converted to wide characters.
// The copy in mpWCTargetSpell is what nextPhnSpellMatch() compares against.
PhnSpell *PhnSpellArray::firstPhnSpellMatch( const char *pTargetSpell ) {

    // Grow the cached wide-char buffer if this target does not fit.
    size_t targetSize = strlen( pTargetSpell ) + 1;
    if ( mnWCTargetSpellSize < targetSize ) {
        DgnDeleteArray( mpWCTargetSpell );

        mpWCTargetSpell = DgnNewArray( wchar_t, targetSize );
        memset( mpWCTargetSpell, 0, targetSize * sizeof(wchar_t) );
        mnWCTargetSpellSize = (uns16) targetSize;

        assert( (uns32) mnWCTargetSpellSize == (uns32) targetSize );
    }

    // mbstowcs reports an invalid multibyte sequence with (size_t)-1; the
    // buffer contents are then unusable, so treat it as "no match".
    if ( mbstowcs( mpWCTargetSpell, pTargetSpell, targetSize ) == (size_t) -1 ) {
        mpWCTargetSpell[0] = 0;
        return 0;
    }

    // get the offset for the first char
    WideCharOffset wideCharOffset( mpWCTargetSpell[0], 0 );
    int pos;
    int retVal = mWCOffsetTable.find( wideCharOffset, WideCharOffsetCmp,
                                      &pos );

    if ( retVal != -1 ) {
        PhnSpell *pPhnSpell = mpPhnSpellData + mWCOffsetTable[pos].getOffset();
        return pPhnSpell;
    }

    return 0;
}

////////////////////////////////////////////////////////////////////////////
// Find the entry for the first character in pWCTargetSpell, which has to be
// the first match for the target spelling.
//
// pWCTargetSpell is copied into mpWCTargetSpell (used later by
// nextPhnSpellMatch()).  Returns 0 when the first character has no entry
// in mWCOffsetTable.
PhnSpell *PhnSpellArray::firstPhnSpellMatch( wchar_t *pWCTargetSpell ) {
    assert( pWCTargetSpell );

    // The original copied with an unchecked wcscpy ("BUG check length").
    // Grow the cached buffer the same way the char* overload does.
    size_t targetSize = wcslen( pWCTargetSpell ) + 1;
    if ( mnWCTargetSpellSize < targetSize ) {
        // Copy before releasing the old buffer, in case the caller passed a
        // pointer into it.
        wchar_t *pNewBuf = DgnNewArray( wchar_t, targetSize );
        memcpy( pNewBuf, pWCTargetSpell, targetSize * sizeof(wchar_t) );
        DgnDeleteArray( mpWCTargetSpell );
        mpWCTargetSpell = pNewBuf;
        mnWCTargetSpellSize = (uns16) targetSize;

        assert( (uns32) mnWCTargetSpellSize == (uns32) targetSize );
    } else if ( pWCTargetSpell != mpWCTargetSpell ) {
        // memmove tolerates a source that overlaps the cached buffer.
        memmove( mpWCTargetSpell, pWCTargetSpell,
                 targetSize * sizeof(wchar_t) );
    }

    // get the offset for the first char
    WideCharOffset wideCharOffset( mpWCTargetSpell[0], 0 );
    int pos;
    int retVal = mWCOffsetTable.find( wideCharOffset, WideCharOffsetCmp,
                                      &pos );

    if ( retVal != -1 ) {
        PhnSpell *pPhnSpell = mpPhnSpellData + mWCOffsetTable[pos].getOffset();
        return pPhnSpell;
    }

    return 0;
}



////////////////////////////////////////////////////////////////////////////
// Return the next offset in mpPhnSpellData which is the start of a
// spelling, or 0 when the end of the PhnSpell data has been reached.
//
// Skips forward to the PHNSPELL_END_OF_ENTRY element that closes the
// current entry; a second terminator immediately after it marks the end of
// the data.
PhnSpell *PhnSpellArray::nextSpelling( PhnSpell *pPhnSpell ) {
    assert( ckPhnSpellPtr( pPhnSpell ) );

    // Advance to the terminator of the current entry ...
    while ( *pPhnSpell != PHNSPELL_END_OF_ENTRY ) {
        pPhnSpell++;
    }
    // ... and step past it.
    pPhnSpell++;

    if ( *pPhnSpell == PHNSPELL_END_OF_ENTRY ) {    // Check for end of PhnSpell data
        return 0;
    }

    assert( ckPhnSpellPtr( pPhnSpell ) );
    return pPhnSpell;
}

////////////////////////////////////////////////////////////////////////////
// Find the next entry in mpPhnSpellData which is a partial match to
// mpWCTargetSpell, if there is one.
//
// An entry matches when its whole spelling is a prefix of mpWCTargetSpell.
// Returns NULL when the data is exhausted or when an entry compares greater
// than the target over its own length.
PhnSpell *PhnSpellArray::nextPhnSpellMatch( PhnSpell *pPhnSpell ) {
    assert( mpWCTargetSpell );
    assert( ckPhnSpellPtr( pPhnSpell ) );

    PhnSpell *pNextPhnSpell = pPhnSpell;
    int cmpVal = 0;

    for ( ;; ) {
        pNextPhnSpell = nextSpelling( pNextPhnSpell );
        if ( pNextPhnSpell == 0 ) {
            return NULL;
        }

        // Compare only over the length of the table spelling (prefix test).
        cmpVal = wcsncmp( mpWCTargetSpell, pNextPhnSpell,
                          wcslen( pNextPhnSpell ) );

        if ( cmpVal == 0 ) {
            return pNextPhnSpell;
        }

        // Note that we don't stop until the table spelling is greater than
        // mpWCTargetSpell, because if we are looking for "at", we may find
        // "al" first.
        if ( cmpVal < 0 ) {
            return NULL;
        }
    }
}



////////////////////////////////////////////////////////////////////////////
// Get the frequency, which is stored in the uns16 element immediately
// following the one pPhnSpell points at.  (getGuessWords() calls this with
// a pron-offset pointer, so the value read is that pron's frequency.)
inline SDInteger PhnSpellArray::getFrequency( PhnSpell *pPhnSpell ) {
    assert( ckPhnSpellPtr( pPhnSpell ) );

    return (uns16) *( pPhnSpell + 1 );
}

////////////////////////////////////////////////////////////////////////////
// Get the offset for the first pron for the spelling at pPhnSpell.
//
// Skips the 0-terminated spelling and returns a pointer to the element
// right after the terminator, which holds the first pron offset.
PronOffset *PhnSpellArray::getOffsetPron0( PhnSpell *pPhnSpell ) {
    assert( ckPhnSpellPtr( pPhnSpell ) );

    while ( *pPhnSpell ) {  // advance to the NULL-terminator for the spelling
        pPhnSpell++;
    }

    pPhnSpell++;            // advance to the pron offset

    // Every spelling is expected to carry at least one pron.
    assert( *pPhnSpell != PHNSPELL_END_OF_ENTRY );
    assert( ckPhnSpellPtr( pPhnSpell ) );

    return pPhnSpell;
}

////////////////////////////////////////////////////////////////////////////
// Return a pointer to the next pron offset of the current entry, or 0 when
// the entry's PHNSPELL_END_OF_ENTRY terminator is reached.
//
// Each pron offset is followed by its frequency, so the next pron offset is
// two elements further on.
PronOffset *PhnSpellArray::getOffsetNextPron( PronOffset *pPronOffset )
{
    assert( ckPronOffsetPtr( pPronOffset ) );
    pPronOffset++;      // advance to frequency
    pPronOffset++;      // advance to next pron offset
    assert( ckPronOffsetPtr( pPronOffset ) );

    if ( *pPronOffset == PHNSPELL_END_OF_ENTRY ) {
        return 0;
    }
    return pPronOffset;
}



////////////////////////////////////////////////////////////////////////////
// Return a pointer to the pron located at the offset in mpPronData which
// we find in the element pPronOffset points at.
//
// Returns 0 if pPronOffset points at an entry terminator.
char *PhnSpellArray::getPron( PronOffset *pPronOffset )
{
    assert( ckPronOffsetPtr( pPronOffset ) );

    if ( *pPronOffset == PHNSPELL_END_OF_ENTRY ) {
        return 0;
    }

    char *pRet = mpPronData + (*pPronOffset);
    return pRet;
}



//////////////////////////////////////////////////////////////////////////// 

// Writes mpPronData, mpPhnSpellData, and mWCOf f setTable into a binary file 
// for persistant storage and access by readBinaryFile ( ) . 

void PhnSpe 11 Array: : writeBinaryFile (FILE *pOutFile) { 

unsl6 fileVersion = PHN_S PELL_ARRAY_F I LE_VERS I ON ; 
// write the file format version. 

if (1 != fwrite (&f ileVersion, sizeof (unsl6 ) , 1, pOutFile) ) { 
errThrow(USE_ERR(PhnSpell / 6) ) ; 
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} 

if (1 != f write (&mnEntries, sizeof (unsl6) , 1, pOutFile) ) { 
errThrow (USE_ERR (PhnSpell , 6) ) ; 



if (1 != fwrite (&mTotalFrequency, sizeof <uns32) , l, pOutFile) ) { 
errThrow (USE_ERR (PhnSpell, 6) ) ; 

// write the size of the PhnSpellData 

if (1 != f write UmPhnSpellDataSize, sizeof (PhnSpellC f£ set) , 1, pOutFile) 
errThrow (USE_ERR{ PhnSpell, 6) ) ; 



// write the PhnSpellData 
if ( mPhnSpellDataSize != 

f write (mpPhnSpellData, sizeof (unsl6) , mPhnSpellDataSize , 

pOutFile) ) { 

errThrow (USE_ERR (PhnSpell, 6) ) ; 



// write the size of the PronData 

if (1 != f write ( &mnProriDataSize , sizeof (PronOff set ) , 1, pOutFile) ) { 
^ errThrow (USE_ERR (PhnSpell, 6) ) ; 

// write the PronData 
if (mnPronDataSize != 

fwrite (mpPronData, sizeof (char) , mnPronDataSize, pOutFile) ) 

errThrow (USE_ERR (PhnSpell, 6) ) ; 

// write the number of elements in the WideCharOf f setTable 
int32 nWCOffsets = mWCOff setTable . count () ; 

if (1 != fwrite UnWCOff sets, sizeof (int32) , 1, pOutFile) ) { 
errThrow (USE_ERR (PhnSpell , 6) ) ; 



// write the WideCharOf f setTable 

void *pEntry = mWCOff setTable . first () ; 

if (nWCOffsets != 

CAST(int32, fwrite (pEntry, sizeof (WideCharOf f set ) , 
nWCOffsets, pOutFile)) ) { 

errThrow (USE_ERR (PhnSpell, 6) ) ; 



//////////////////////////////////////////////////////////////////////////// 

// Reads a binary file written by writeBinaryFile ( ) into mpPronData, 

// mpPhnSpellData, and mWCOff setTable . 

// 

void PhnSpellArray : ireadBinaryFile (FILE *pInFile) { 

unsl6 fileVersion; 

// read the file format version. 

if (1 != freadUfileVersion, sizeof (unsl6) , 1, plnFile) ) { 
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errThrow (USE_ERR (PhnSpell , 5) ) ; 



if (fileVersion != PHN_S PE LL_ARRAY_F I LE_VER S I ON ) { 
errThrow (USE_ERR (PhnSpell, 3) , fileVersion, 
PHN SPELL ARRAY FILE VERSION) ; 
" } " " " 

if (1 != freadUmnEntries, sizeof (unsl6) , 1, plnFile) ) { 
errThrow ( US EJERR (PhnSpell , 5) ) ; 



if (1 != f read (&mTotal Frequency, sizeof (uns32 ) , 1, plnFile) ) { 
errThrow (USE_ERR (PhnSpell, 5) ) ; 



// read the size of the PhnSpellData 

if (1 != fread(&mPhnSpellDataSize, sizeof (PhnSpellOff set) , 1, plnFile) ) 
^ errThrow (USE_ERR (PhnSpell, 5) ) ; 

// read the PhnSpellData 

mpPhnSpellData = DgnNewArray ( unsl6, mPhnSpellDataSize ); 
if (mPhnSpellDataSize 1= 

f read(mpPhnSpellData, sizeof (unsl6) , mPhnSpellDataSize, plnFile) ) 

errThrow (USE_ERR (PhnSpell , 5) ) ; 



// read the size of the PronData 

if (1 != fread(&mnPronDataSize, sizeof (PronOff set ) , 1, plnFile) ) { 
errThrow (USE_ERR (PhnSpell, 5) ) ; 



// read the PronData 

mpPronData = DgnNewArray ( char, mnPronDataSize ) ; 

if (mnPronDataSize 1= f read (mpPronData, sizeof (char) , mnPronDataSize, 
plnFile) ) { 

errThrow (USE_ERR (PhnSpell, 5) ) ; 



// read the number of elements in the WideCharOf f setTable 
int32 nWCOffsets; 

if (1 != fread(&nWCOf fsets, sizeof (int32) , 1, plnFile) ) { 
^ errThrow (USE_ERR (PhnSpell, 5) ) ; 

mWCOff setTable . setSize (nWCOffsets) ; 

// read the WideCharOf f setTable 
// Future: find & use READ_MANY (trecutil) 
if (nWCOffsets != 

CAST(int32, f read (mWCOff setTable . getData ( ) , 
sizeof (WideCharOf f set) , nWCOffsets, plnFile)) ) { 
^ errThrow ( US E_ERR (PhnSpell , 5) ) ; 

// verifyDataO ; 
#if 0

// Unimplemented consistency check (disabled).  The intended checks are
// listed below; as written the stub only throws PhnSpell error 4.
int PhnSpellArray::verifyData()
{
    // check that mpPhnSpellData is in incr alpha order on spellings
    // check (0 < pron offsets <= PronTable size)
    // check mWCOffsetTable offsets
    errThrow( USE_ERR( PhnSpell, 4 ) );
    return 0;
}

#endif



//////////////////////////////////////////////////////////////////////////// 

// Compare func for use with mWCOff setTable which is a DgivACo 
// 

int WideCharOf f setCmp (const void *given, const void *test) { 
assert (given) ; 
assert (test) ; 

int result = wcsncmp { ( (WideCharOf f set *) given) - >getChar () , 

( (WideCharOf f set *) test) ->getChar () , 
1 ) ; 

return result; 

} 



//////////////////////////////////////////////////////////////////////////// 

// Compare func for use with PronOf f setTbl which is a DgnACo 

// 

int PronOf fsetEntryCmp (const void *given, const void *test) 

assert (given) ; 
assert (test) ; 

int result = strcmp( ( (PronOf fsetEntry *) given) ->mpStr, 

( (PronOf fsetEntry *) test) ->mpStr) ; 

return result; 

} 

const unsl6 DATA_GROWTH_S I ZE = 1024; 

//////////////////////////////////////////////////////////////////////////// 

// looks up pData, which is a pron, in PronOf f setTbl . If it is found, then 
// the offset for the pron is in the PronOf fsetEntry , otherwise we add an 
entry 

// to both mpPronData, growing it if necessary, and also add another 
PronOf fsetEntry to 
// PronOf f setTbl. 

// 

// Note that PronOf f setTbl is only around while we are in readAsciiO . 
// 

PronOf f set PronOf f setTbl : :getOff set (char *pData) { 
assert (pData) ; 
int searchResult ; 

// See if we need to grow the DataBlock 
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size_t dataLen = strlen (pData) +1 ; 

if (mCurrentOf f set + dataLen > mnDataSize) { 

char *pNewBuf = DgnNe wAr ray (char , mnDataSize DATA_GROWTH_SIZE ) ; 
memcpy (pNewBuf , mpDataBlock, mnDataSize) ; 
DgnDeleteArray (mpDataBlock) ; 
mpDataBlock = pNewBuf; 
mnDataSize += DATA_GROWTH_S I ZE ; 



// Figure out offset of data in mpDataBlock 
strcpy (mpDataBlock + mCurrentOf f set , pData) ; 

PronOf f setEntry *pPronOff setEntry = DgnNew( PronOf f setEntry ) ; 
pPronOf f setEntry->init (mpDataBlock, mCurrentOf f set ) ; 
int posData; 

searchResult = f ind (pPronOff setEntry , PronOf f set EntryCmp, &posData) ; 

// Have we seen pData before? 
if (searchResult == -1) { 

// No, update mCurrentOf f set and pronOf f set Array . 

uns32 newOffset = mCurrentOf f set + dataLen; 

assert (newOffset < Oxff ff ) ; 

mCurrentOf f set = (PronOf f set) newOffset; // char after null, 
add (pPronOff setEntry, posData); 

} else { 

// Yes, so we have the offset and don't need the Off setEntry 
memset (mpDataBlock + mCurrentOf f set , 0, dataLen); 
DgnDelete (pPronOff setEntry) ; 
pPronOff setEntry = (*this) [searchResult] ; 

return pPronOff setEntry- >mOf f set ; 



//////////////////////////////////////////////////////////////////////////// 

// build our data structures in memory for mpPronData, mpPhnSpellData, and 
// mWCOf f setTable from an Ascii file of spellings, prons and frequencies. 

// For now, ASCII file must be in ascending alphabetic order on the spelling, 
void PhnSpellArray : :readAscii (FILE * pDataFile) { 

assert (pDataFile) ; 

PronOf fsetTbl *pPronTable = DgnNew (PronOf f set Tbl) ; 

PhnSpellOf f set nDataBlockSize = 2048; 

unslG *pDataBlock = DgnNew ( unsl6 [ nDataBlockSize ]) ; 
memset (pDataBlock, 0, nDataBlockSize) ; 
pDataBlock [0] = PHNS PELL_END_OF_ENTRY ; 
unsl6 *pCurPos = pDataBlock + 1; 

char linebuf [512] = » " ; 
char phonemes [128] = " " ; 
char spelling [128] = " " ; 
char prevSpelling [128] = " 11 ; 
unsigned int frequency = 0; 
int numfields; 
for ( ; ; ) 

{ 

PhnSpellOf f set curOffset = CAST (PhnSpellOf f set , pCurPos - pDataBlock); 
// do we need to grow the data block? 
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// In this iteration, we may add as many bytes as 
// 2*strlen (spelling) + 2 bytes for O-terminator 
// + 2 for pronOffset + 2 for frequency + 2 for entry terminator. 
// so as long as strlen (spelling) is less than 40, ve're fine... 
// As of 8/1/96, the longest spelling was bearly into double - digi ts . 
if (curOffset + 100 > nDataBlockSize) 

PhnSpellOf fset newSize = (unsl6) (nDataBlockSize + 

DATA_GROWTH_SIZE) ; 

assert ( (uns32) newSize == (uns32) (nDataBlockSize + 
DATA_GROWTH_S I ZE ) ) ; 

unsl6 *pNewBlock = DgnNew (unsl6 [ newSize ]); 

memcpy( pNewBlock, pDataBlock, curOf f set*sizeof (unsl6 ) ); 

nDataBlockSize = newSize; 

DgnDeleteArray (pDataBlock) ; 

pDataBlock = pNewBlock; 

pCurPos = pDataBlock + curOffset; 

if (fgets (linebuf , sizeof (linebuf ) , pDataFile) == NULL) 

*pCurPos = PHNSPELL_END_OF_ENTRY; 
break; 

} 

numfields = sscanf (linebuf , "%s %s %ud", spelling, phonemes, 

Scf requency) ; 

assert (strlen (spelling) < 40); 

if ( numfields < 3 j j frequency > Oxffff ) { 
^ errThrow(USE_ERR( PhnSpell, 1), linebuf); 

if (frequency ==0) // BUG KLUDGE We must not have entries in 
table with freq == 0 

frequency =3; // because a word w/ freq == 0 would cause 

log ( 0 ) eventual ly 

// Force Ascii file to be in incr alpha order on spelling, 
if (strcmp (spelling, prevSpelling) < 0) { 

errThrow(USE_ERR (PhnSpell, 2), spelling, prevSpelling); 

// Check for new spelling. 

if ( strcmp (spelling, prevSpelling) ) { 

// finish prev. pron/LM list 

*pCurPos = PHNSPELL_END_OF_ENTRY; 
pCurPos++; 

// convert to wide -char string 
size_t targetLen = strlen (spelling) ; 

wchar_t *pWCSpell = DgnNew ( wchar_t [ targetLen + 1 ] ); 
memset (pWCSpell, 0, ( targetLen+1) *sizeof (wchar_t) ) ; 
mbstowcs (pWCSpell , spelling, targetLen) ; 
wcscpy(UNS16PTOWCP(pCurPos) , pWCSpell); // start new 

spelling 

if ( spelling [0] != prevSpelling [0] ) {// enter in 
WideCharOf f setTable 

uns32 wcOffset = pCurPos - pDataBlock; 
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assert (wcOff set < Oxffff) ; 
// xprintf ("%10s %10s %5d %7d\n n , spelling, phonemes, 
frequency, wcOf fset) ; 

WideCharOf fset wideCharOf f set (pWCSpell [0] , (unsl6) 

wcOf f set) ; 

mWCOf f setTable . add (wideCharOf fset ] : 

} 

pCurPos += targetLen + 1; 
strcpy (prevSpelling, spelling) ; 
DgnDeleteArray (pWCSpell) ; 



// write another pron/LM entry 

PronOffset pronOffset = pPronTable- >getOf fset (phonemes ) ; 
*pCurPos = pronOffset; 
pCurPos++ ; 

*pCurPos = (PronOffset) frequency; 
assert (*pCurPos == frequency); 
pCurPos++ ; 

mTotalFrequency += frequency; 
mnEntries++ ; 

assert ( (pCurPos - pDataBlock) < nDataBlockSize); 
pCurPos++ ; 

*pCurPos = PHNSPELL_END_OF_ENTRY; 

mpPronData = pPronTable- >getCopyOfDataBlock () ; 
mnPronDataSize = pPronTable- >getCurrentOf fset () ; 
DgnDelete (pPronTable) ; 

xprintf ( "unsl6 PhnSpellData [] used %d of %d allocated entries . \n" , 
pCurPos - pDataBlock, nDataBlockSize) ; 



mPhnSpellDataSize = (unsl6) (pCurPos - pDataBlock); 

assert ( (unsl6) mPhnSpellDataSize == (unsl6) (pCurPos - pDataBlock)); 
mpPhnSpellData = DgnNew( unsl6 [ mPhnSpellDataSize ] ); 
memcpy( mpPhnSpellData, pDataBlock, mPhnSpellDataSize*sizeof (unsl6 ) ) 
DgnDeleteArray (pDataBlock) ; 



/*/////// Old History /////////////////////////////// 

2 3/24/97 5:04p Joel 

PHONEQUERY Ver 0.01.167 
Fixes for Chuck's build for TREC: okay, but we still don't use 
SDWord_SetFrequency, etc., which does not exist in TREC... 
so the TREC build currently is broken... Chuck, can you fix this? 

I 3/24/97 16:30 Chuck 
PHONEQUERY Ver 0.01.165 
Added prons lib 

II 11/07/96 12:11 Chuck 
TAHITI Ver 0.04.390 

Now support Silence in pron guessing, if it shows up in SDResult. 
Fixed ErrThrow for unknown character in spelling. 



10 10/07/96 11:39 Chuck 
TAHITI Ver 0.04.337 



Added error throw for unknown char in spelling. 

9 9/30/96 6:36p Joel 

TAHITI Ver 0.04.317 

8 9/13/96 9:41a Chuck 

TAHITI Ver 0.04.307 

Changed word fragment names to avoid collisions with real 
words (6, a, . . . ) 

7 8/06/96 6:24p Chuck 

TAHITI Ver 0.04.252 
We now use pointers instead of offsets when reading PhnSpellArray 
Less work, easier to read, and we have assertions too. 
updated .ans to reflect bug fix in reading frequencies for 

pron-networks 

6 7/29/96 10:09a Joel 

TAHITI Ver 0.04.232 

5 7/17/96 2:32p Chuck 

TAHITI Ver 0.04.210 

Removed data members used for bookkeeping purposes, that stuff belongs 
to 

the caller now, which is in phnguess . {h, cpp} . 

4 7/10/96 3:39p Chuck 

TAHITI Ver 0.04.198 
ruleltem array is now DgnAC< SDRuleltem > and much better for it. 

3 7/09/96 11:25a Chuck 

TAHITI Ver 0.04.197 
avoid compiler bug in getGuessRule . 

2 7/08/96 8:10p Chuck 

TAHITI Ver 0.04.194 

1 5/18/96 ll:01p Tim 

Moving over from TLIB. 
$NoKeywords : $ 

Old TLIB revision history follows. 
*t lib- revision- history* 

1 phnspell.cpp 02-Feb-96, 18 : 00 : 50, l CHUCK' TAHITI Ver 0.03.222 

2 phnspell.cpp 14-Mar-96, 11 :12 :42, *CHUCK' TAHITI Ver 0.03.321 

3 phnspell.cpp 27-Mar- 96 , 10 : 47 : 00 , 1 CHUCK' TAHITI Ver 0.03.350 

4 phnspell.cpp 01 -Apr- 96, 12 : 56 : 44, v CHUCK' TAHITI Ver 0.03.363 

5 phnspell.cpp 08-Apr- 96 , 09 : 18 : 10 , » CHUCK' TAHITI Ver C 33.375 

6 phnspell.cpp 10-Apr-96 , 18 : 56 : 54 , * STAN' TAHITI Ver 0.03.382 

7 PHNSPELL.CPP 17 -Apr- 96 , 09 : 3 0 : 08 , * BENT ' TAHITI Ver 0.03.402 

8 phnspell.cpp 17-Apr- 96 , 14 : 33 : 26 , *HUGH' TAHITI Ver 0.03.404 

9 phnspell.cpp 23-Apr-96 , 18 : 04 : 58 , * STAN' TAHITI Ver 0.04.020 

10 phnspell.cpp 23 -Apr- 96 , 19 : 40 : 54 , % STAN' TAHITI Ver 0.04.021 

11 phnspell.cpp 25-Apr-96 , 13 : 19 : 14 , x STAN' TAHITI Ver 0.04.028 

12 phnspell.cpp 1 7 -May- 96,20:03:48, 1 CHUCK ' TAHITI Ver 0.04.097 

13 PHNSPELL.CPP 18 -May- 96 , 18 : 54 : 52 , 1 TIM' TAHITI Ver 0.04.100 
*t lib- revision-history* 

Revision 13 on Sat May 18 18:54:36 1996 by tim TAHITI Ver 0.04.100 

Revision 12 on Fri May 17 20:03:46 1996 by Chuck TAH:*'?I Ver 0.04.097 
Redesigning interface. . . 
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Revision 11 on Thu Apr 25 13:19:11 1996 by stan TAHITI Ver 0.04.028 
Backed off get-printf -out-of -low-level changes. 

Revision 10 on Tue Apr 23 19:40:52 1996 by stan TAHITI Ver 0.04.021 

Revision 9 on Tue Apr 23 18:04:54 1996 by stan TAHITI Ver 0.04.020 

Revision 8 on Wed Apr 17 14:33:27 1996 by Hugh TAHITJ Ver 0.03.404 
Project version of SDState_IterateWords ( ) is now used 

Revision 7 on Wed Apr 17 09:30:03 1996 by Bent TAHITI Ver 0.03.402 

Revision 6 on Wed Apr 10 18:56:50 1996 by stan TAHITI Ver 0.03.382 

Revision 5 on Mon Apr 08 09:18:08 1996 by Chuck TAHITI Ver 0.03.375 
Support for persistant PhnSpellArray object. 
Added errThrow(s) to code. 
Added read/write binary file functions. 

Revision 4 on Mon Apr 01 12:56:42 1996 by Chuck TAHITI Ver 0.03.363 
Support for gudtest and instrumentation for built/<?ict words 

Revision 3 on Wed Mar 27 10:46:58 1996 by Chuck TAHITI Ver 0.03.350 



Revision 2 on Thu Mar 14 11:12:40 1996 by Chuck TAHITI Ver 0.03.321 
Revision 1 on Fri Feb 02 18:00:48 1996 by Chuck TAHITI Ver 0.03.222 

// 

//////////////////////////////////////////////////////////////////////////// 
*/ 
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